In [1]:
from __future__ import division
from IPython.display import display
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import random, sys, os, re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.cross_validation import cross_val_predict, permutation_test_score
In [2]:
# Experiment configuration -- everything a reader might want to tweak.
SEED = 97        # RNG seed for the train/validation split
N_CLASSES = 2    # binary target: donated / did not donate

# Feature-preprocessing switches passed straight through to load_blood_data.
scale = False
minmax = False
norm = False
nointercept = True
engineering = False

submission_filename = "../submissions/submission_cosine_similarity.csv"
In [3]:
from load_blood_data import load_blood_data

# Training labels and design matrix, using the configured preprocessing.
loader_opts = dict(SEED=SEED, scale=scale, minmax=minmax, norm=norm,
                   nointercept=nointercept, engineering=engineering)
y_train, X_train = load_blood_data(train=True, **loader_opts)
In [4]:
# Hold out a third of the training data to pick k on.
# Note: X_train / y_train are re-bound to the training portion only.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                      test_size=0.33,
                                                      random_state=SEED)
In [5]:
from sklearn.metrics import confusion_matrix

y_true = y_valid

# Choose k for the cosine-similarity k-NN vote by validation accuracy.
# NOTE(review): the original used a 0.51 cutoff here but 0.5 in the
# prediction cells below, so model selection evaluated a slightly
# different classifier than the one deployed; 0.5 is used throughout now.
scores = []
for how_many in range(1, 51):
    y_pred = []
    for i in range(len(X_valid)):
        # Indices of the `how_many` training rows most similar to validation row i.
        top_how_many = cosine_similarity(X_valid.values[i:i+1], X_train.values).argsort()[0][::-1][:how_many]
        # NOTE(review): y_train[ii] assumes positional integer indexing --
        # confirm load_blood_data returns a positionally-indexable y.
        if np.mean([y_train[ii] for ii in top_how_many]) < 0.5:
            y_pred.append(0)
        else:
            y_pred.append(1)
    cm = confusion_matrix(y_true, y_pred, labels=None)
    # Validation accuracy = trace / total of the confusion matrix.
    scores.append(round(np.trace(cm) / float(np.sum(cm)), 4))

best_k = np.argmax(scores) + 1  # +1 because k values start at 1
print("best_k {}".format(best_k))
In [6]:
# Re-score the validation set with the selected neighbourhood size.
how_many = best_k
y_pred = []
for i in range(len(X_valid)):
    # Rank all training rows by cosine similarity to validation row i,
    # most similar first, and keep the `how_many` closest.
    ranked = cosine_similarity(X_valid.values[i:i+1], X_train.values).argsort()[0][::-1]
    top_how_many = ranked[:how_many]
    vote = np.mean([y_train[ii] for ii in top_how_many])
    y_pred.append(0 if vote < 0.5 else 1)
In [7]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true, y_pred, labels=None)
print(cm)  # was a Python-2 `print cm` statement; file otherwise uses print()

try:
    from sklearn_utilities import plot_confusion_matrix
except ImportError:
    # Fallback: load the helper straight from a local copy.
    # NOTE(review): hardcoded absolute path -- make this configurable.
    # `imp` is deprecated (removed in Python 3.12); migrate to importlib.util.
    import imp, os
    util = imp.load_source('sklearn_utilities', os.path.expanduser('~/Dropbox/Python/sklearn_utilities.py'))
    from sklearn_utilities import plot_confusion_matrix

plot_confusion_matrix(cm, ['Did not Donate','Donated'])

# Validation accuracy and misclassification rate from the confusion matrix.
accuracy = round(np.trace(cm) / float(np.sum(cm)), 4)
misclass = 1 - accuracy
print("Accuracy {}, mis-class rate {}".format(accuracy, misclass))
In [8]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score

# ROC for the hard 0/1 validation predictions (a single operating point).
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=None)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.plot(fpr, tpr)

# Annotate the headline metrics directly on the figure.
ax.text(0.6, 0.4, "AUC {:.4f}".format(roc_auc_score(y_true, y_pred, average='macro')), fontsize=16)
ax.text(0.6, 0.3, "accuracy {:.2f}%".format(accuracy * 100), fontsize=16)
ax.text(0.6, 0.2, "LogLoss {:.4f}".format(log_loss(y_true, y_pred)), fontsize=16)
ax.text(0.6, 0.1, "f1 {:.4f}".format(f1_score(y_true, y_pred)), fontsize=16)

ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve')
plt.show()
In [35]:
# Drop the train/validation split before refitting on the full training set.
X_train = X_valid = None
y_train = y_valid = None
In [36]:
from load_blood_data import load_blood_data

# Reload the FULL training set (the earlier split re-bound these names).
full_train_opts = dict(SEED=SEED, scale=scale, minmax=minmax, norm=norm,
                       nointercept=nointercept, engineering=engineering)
y_train, X_train = load_blood_data(train=True, **full_train_opts)
In [37]:
from load_blood_data import load_blood_data

# Test design matrix plus the submission IDs, same preprocessing as training.
test_opts = dict(SEED=SEED, scale=scale, minmax=minmax, norm=norm,
                 nointercept=nointercept, engineering=engineering)
X_test, IDs = load_blood_data(train=False, **test_opts)
In [38]:
how_many = best_k

def _knn_vote(row):
    """0/1 majority vote of the `how_many` training rows most cosine-similar to `row`."""
    ranked = cosine_similarity(row, X_train.values).argsort()[0][::-1]
    neighbours = ranked[:how_many]
    return 0 if np.mean([y_train[jj] for jj in neighbours]) < 0.5 else 1

y_pred = [_knn_vote(X_test.values[i:i+1]) for i in range(len(X_test))]
In [39]:
donate_probs = [0.65 if x>0 else 1-0.65 for x in y_pred]
In [40]:
# Sanity check: peek at the first few votes and their pseudo-probabilities.
for preview in (y_pred, donate_probs):
    print(preview[:10])
In [41]:
# Write the submission CSV: header row, then one "<ID>,<probability>" row
# per test-set example. `with` guarantees the handle is closed even if a
# write raises (the original open/close pair could leak on error).
assert len(IDs) == len(donate_probs)
with open(submission_filename, "w") as f:
    f.write(",Made Donation in March 2007\n")
    for ID, prob in zip(IDs, donate_probs):
        f.write("{},{}\n".format(ID, prob))
In [ ]: